This file is used to combine three datasets:
We load each individual sample, remove melanocytes, and merge the remaining cells.
library(dplyr)
library(patchwork)
library(ggplot2)
library(ComplexHeatmap)
.libPaths()
## [1] "/usr/local/lib/R/library"
In this section, we set the global settings of the analysis. We will store data there :
save_name = "data3"
out_dir = "."
n_threads = 5 # for tSNE
We combine the three sample information :
sample_info_1 = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_sample_info.rds"))
sample_info_2 = readRDS(paste0(out_dir, "/../5_wu/1_metadata/wu_sample_info.rds"))
sample_info_3 = readRDS(paste0(out_dir, "/../6_takahashi/1_metadata/takahashi_sample_info.rds"))
column_to_keep = c("project_name", "sample_type", "sample_identifier",
"platform", "gender", "location", "laboratory", "color")
sample_info = rbind.data.frame(sample_info_1[, column_to_keep],
sample_info_2[, column_to_keep],
sample_info_3[, column_to_keep],
stringsAsFactors = FALSE)
graphics::pie(rep(1, nrow(sample_info)),
col = sample_info$color,
labels = sample_info$project_name)
Here are custom colors for each cell type :
color_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_color_markers.rds"))
data.frame(cell_type = names(color_markers),
color = unlist(color_markers)) %>%
ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
ggplot2::geom_point(pch = 21, size = 5) +
ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
ggplot2::theme_classic() +
ggplot2::theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(angle = 30, hjust = 1))
We load the markers and specific colors for each cell type :
cell_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_cell_markers.rds"))
lengths(cell_markers)
## CD4 T cells CD8 T cells Langerhans cells
## 13 13 9
## macrophages B cells cuticle
## 10 16 15
## cortex medulla IRS
## 16 10 16
## proliferative HF-SCs IFE basal
## 20 17 16
## IFE granular spinous ORS melanocytes
## 17 15 10
## sebocytes
## 8
We load markers to display on the dotplot :
dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers
## $`CD4 T cells`
## [1] "PTPRC" "CD3E" "CD4"
##
## $`CD8 T cells`
## [1] "CD3E" "CD8A"
##
## $`Langerhans cells`
## [1] "CD207" "CPVL"
##
## $macrophages
## [1] "TREM2" "MSR1"
##
## $`B cells`
## [1] "CD79A" "CD79B"
##
## $cuticle
## [1] "MSX2" "KRT32" "KRT35"
##
## $cortex
## [1] "KRT31" "PRR9"
##
## $medulla
## [1] "BAMBI" "ADLH1A3"
##
## $IRS
## [1] "KRT71" "KRT73"
##
## $proliferative
## [1] "TOP2A" "MCM5" "TK1"
##
## $`HF-SCs`
## [1] "KRT14" "CXCL14"
##
## $`IFE basal`
## [1] "COL17A1" "KRT15"
##
## $`IFE granular spinous`
## [1] "SPINK5" "KRT1"
##
## $ORS
## [1] "KRT16" "KRT6C"
##
## $melanocytes
## [1] "DCT" "MLANA"
##
## $sebocytes
## [1] "CLMP" "PPARG"
For each sample, we :
We load individual datasets :
sobj_list = list()
# Our data
project_names_oi = sample_info_1$project_name
sobj_list[["here"]] = lapply(project_names_oi, FUN = function(one_project_name) {
subsobj = readRDS(paste0(out_dir, "/../2_individual/datasets/",
one_project_name, "_sobj_filtered.rds"))
return(subsobj)
})
names(sobj_list[["here"]]) = project_names_oi
# Wu data
project_names_oi = sample_info_2$project_name
sobj_list[["wu"]] = lapply(project_names_oi, FUN = function(one_project_name) {
subsobj = readRDS(paste0(out_dir, "/../5_wu/2_individual/datasets/",
one_project_name, "_sobj_filtered.rds"))
return(subsobj)
})
names(sobj_list[["wu"]]) = project_names_oi
# Takahashi data
project_names_oi = sample_info_3$project_name
sobj_list[["takahashi"]] = lapply(project_names_oi, FUN = function(one_project_name) {
subsobj = readRDS(paste0(out_dir, "/../6_takahashi/2_individual/datasets/",
one_project_name, "_sobj_filtered.rds"))
return(subsobj)
})
names(sobj_list[["takahashi"]]) = project_names_oi
# Unlist
sobj_list = unlist(sobj_list, recursive = FALSE)
lapply(sobj_list, FUN = dim) %>%
do.call(rbind, .) %>%
rbind(., colSums(.))
## [,1] [,2]
## here.2021_31 27955 1043
## here.2021_36 27955 602
## here.2021_41 27955 2256
## here.2022_03 27955 3977
## here.2022_14 27955 2588
## here.2022_01 27955 1213
## here.2022_02 27955 2286
## wu.F18 27955 1372
## wu.F31B 27955 4786
## wu.F31W 27955 3520
## wu.F59 27955 2445
## wu.F62B 27955 3279
## wu.F62W 27955 2360
## takahashi.GSM3717034 10320 71
## takahashi.GSM3717035 12129 275
## takahashi.GSM3717036 14170 510
## takahashi.GSM3717037 32458 4084
## takahashi.GSM3717038 32458 1094
## 464950 37761
We represent cells in the tSNE :
name2D = "RNA_pca_20_tsne"
We look at cell type annotation for each dataset :
plot_list = lapply(sobj_list, FUN = function(one_sobj) {
mytitle = as.character(unique(one_sobj$project_name))
mysubtitle = ncol(one_sobj)
if (!(name2D %in% names(one_sobj@reductions))) {
name2D = names(one_sobj@reductions)[2]
}
p = Seurat::DimPlot(one_sobj, group.by = "cell_type",
reduction = name2D) +
ggplot2::scale_color_manual(values = color_markers,
breaks = names(color_markers),
name = "Cell Type") +
ggplot2::labs(title = mytitle,
subtitle = paste0(mysubtitle, " cells")) +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
Seurat::NoAxes()
return(p)
})
plot_list[[length(plot_list) + 1]] = patchwork::guide_area()
patchwork::wrap_plots(plot_list, ncol = 4) +
patchwork::plot_layout(guides = "collect") &
ggplot2::theme(legend.position = "right")
and clustering :
plot_list = lapply(sobj_list, FUN = function(one_sobj) {
mytitle = as.character(unique(one_sobj$project_name))
mysubtitle = ncol(one_sobj)
if (!(name2D %in% names(one_sobj@reductions))) {
name2D = names(one_sobj@reductions)[2]
}
p = Seurat::DimPlot(one_sobj, group.by = "seurat_clusters",
reduction = name2D, label = TRUE) +
ggplot2::labs(title = mytitle,
subtitle = paste0(mysubtitle, " cells")) +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
Seurat::NoAxes() + Seurat::NoLegend()
return(p)
})
patchwork::wrap_plots(plot_list, ncol = 4)
For each individual dataset, we remove melanocytes. First, we smooth cell type annotation at a cluster level :
sobj_list = lapply(sobj_list, FUN = function(one_sobj) {
cluster_type = table(one_sobj$cell_type, one_sobj$seurat_clusters) %>%
prop.table(., margin = 2) %>%
apply(., 2, which.max)
cluster_type = setNames(nm = names(cluster_type),
levels(one_sobj$cell_type)[cluster_type])
one_sobj$cluster_type = cluster_type[one_sobj$seurat_clusters]
## Output
return(one_sobj)
})
To locate melanocytes, we look at their score, cell type annotation, and clustering.
plot_list = lapply(sobj_list, FUN = function(one_sobj) {
project_name = as.character(unique(one_sobj$project_name))
plot_sublist = list()
if (!(name2D %in% names(one_sobj@reductions))) {
name2D = names(one_sobj@reductions)[2]
}
# Score
plot_sublist[[1]] = Seurat::FeaturePlot(one_sobj, reduction = name2D,
features = "score_melanocytes") +
ggplot2::labs(title = project_name,
subtitle = "Melanocytes score") +
Seurat::NoAxes() +
ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
ggplot2::theme(aspect.ratio = 1,
plot.subtitle = element_text(hjust = 0.5))
# Cell type
plot_sublist[[2]] = Seurat::DimPlot(one_sobj,
reduction = name2D,
group.by = "cell_type",
order = "melanocytes") +
ggplot2::scale_color_manual(values = c("purple", rep("gray92", length(color_markers) - 1)),
breaks = c("melanocytes", setdiff(names(color_markers), "melanocytes"))) +
ggplot2::labs(title = "Cell type annotation",
subtitle = paste0(sum(one_sobj$cell_type == "melanocytes"),
" melanocytes")) +
Seurat::NoAxes() + Seurat::NoLegend() +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
# Clusters
plot_sublist[[3]] = Seurat::DimPlot(one_sobj,
reduction = name2D,
group.by = "seurat_clusters",
label = TRUE) +
ggplot2::labs(title = "Clusters") +
Seurat::NoAxes() + Seurat::NoLegend() +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
# Cluster type
plot_sublist[[4]] = Seurat::DimPlot(one_sobj,
reduction = name2D,
group.by = "cluster_type") +
ggplot2::scale_color_manual(values = c("purple", rep("gray92", length(color_markers) - 1)),
breaks = c("melanocytes", setdiff(names(color_markers), "melanocytes"))) +
ggplot2::labs(title = "Cluster annotation",
subtitle = paste0(sum(one_sobj$cluster_type == "melanocytes"),
" melanocytes")) +
Seurat::NoAxes() + Seurat::NoLegend() +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
return(plot_sublist)
}) %>% unlist(., recursive = FALSE)
patchwork::wrap_plots(plot_list, ncol = 4)
We remove melanocytes based on cluster annotation for 10X datasets and based on the cell type annotation for Drop-Seq datasets :
sobj_list = lapply(sobj_list, FUN = function(one_sobj) {
if (one_sobj@project.name %in% c("GSM3717034", "GSM3717035", "GSM3717036")) {
one_sobj$is_of_interest = (one_sobj$cell_type != "melanocytes")
} else {
one_sobj$is_of_interest = (one_sobj$cluster_type != "melanocytes")
}
if (sum(one_sobj$is_of_interest) > 0) {
one_sobj = subset(one_sobj, is_of_interest == TRUE)
} else {
one_sobj = NA
}
one_sobj$is_of_interest = NULL
return(one_sobj)
})
lapply(sobj_list, FUN = dim) %>%
do.call(rbind, .) %>%
rbind(., colSums(.))
## [,1] [,2]
## here.2021_31 27955 885
## here.2021_36 27955 528
## here.2021_41 27955 1982
## here.2022_03 27955 3457
## here.2022_14 27955 2422
## here.2022_01 27955 835
## here.2022_02 27955 2002
## wu.F18 27955 1372
## wu.F31B 27955 4624
## wu.F31W 27955 3520
## wu.F59 27955 2445
## wu.F62B 27955 3221
## wu.F62W 27955 2360
## takahashi.GSM3717034 10320 65
## takahashi.GSM3717035 12129 270
## takahashi.GSM3717036 14170 497
## takahashi.GSM3717037 32458 3781
## takahashi.GSM3717038 32458 1023
## 464950 35289
We remove melanocytes from annotation :
cell_markers = cell_markers[names(cell_markers) != "melanocytes"]
color_markers = color_markers[names(color_markers) != "melanocytes"]
dotplot_markers = dotplot_markers[names(dotplot_markers) != "melanocytes"]
We re-annotate cells for cell type, since melanocytes have been removed :
sobj_list = lapply(sobj_list, FUN = function(one_sobj) {
# Remove old annotation
one_sobj@meta.data[, grep(colnames(one_sobj@meta.data), pattern = "score", value = TRUE)] = NULL
# Re-annot
one_sobj = aquarius::cell_annot_custom(one_sobj,
newname = "cell_type",
markers = cell_markers,
use_negative = TRUE,
add_score = FALSE,
verbose = TRUE)
# Set factor levels
one_sobj$cell_type = factor(one_sobj$cell_type, levels = names(cell_markers))
return(one_sobj)
})
Our dataset and Wu dataset were processed using the same annotation. In Takahashi dataset, all genes are not shared across datasets:
Note: With the ggvenn package, this is not possible to
make a Venn diagram with 5 sets.
ggvenn::ggvenn(data = list(
here.2021_31 = sobj_list[["here.2021_31"]]@assays[["RNA"]]@meta.features$Ensembl_ID,
wu.F18 = sobj_list[["wu.F18"]]@assays[["RNA"]]@meta.features$Ensembl_ID,
takahashi.GSM3717034 = sobj_list[["takahashi.GSM3717034"]]@assays[["RNA"]]@meta.features$Ensembl_ID,
takahashi.GSM3717038 = sobj_list[["takahashi.GSM3717038"]]@assays[["RNA"]]@meta.features$Ensembl_ID),
stroke_size = 0.5, set_name_size = 4) +
ggplot2::labs(title = "Gene Ensembl IDs between the 4 datasets") +
ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
We keep common genes between all datasets + common genes between the 10X datasets, based on the EnsemblID
# All Ensembl IDs
common_genes = lapply(sobj_list, FUN = function(one_sobj) {
ensembl_id = one_sobj@assays[["RNA"]]@meta.features$Ensembl_ID
return(ensembl_id)
})
names(common_genes) = names(sobj_list)
# Common between 10X datasets
common_genes_10x = common_genes[!(names(common_genes) %in% c("takahashi.GSM3717034",
"takahashi.GSM3717035",
"takahashi.GSM3717036"))] %>%
Reduce(intersect, .)
# Common between all
common_genes = Reduce(intersect, common_genes)
# Venn diagram
ggvenn::ggvenn(data = list(
common_all = common_genes,
common_10X = common_genes_10x),
stroke_size = 0.5, set_name_size = 4) +
ggplot2::labs(title = "Gene Ensembl IDs") +
ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
We keep the union of all these genes :
common_genes = union(common_genes, common_genes_10x)
rm(common_genes_10x)
length(common_genes)
## [1] 25605
To which gene names they correspond, in one of our dataset ?
gene_corresp = sobj_list[["here.2021_31"]]@assays$RNA@meta.features %>%
dplyr::filter(Ensembl_ID %in% common_genes) %>%
dplyr::select(Ensembl_ID, gene_name)
dim(gene_corresp)
## [1] 25605 2
head(gene_corresp)
## Ensembl_ID gene_name
## MIR1302-2HG ENSG00000243485 MIR1302-2HG
## FAM138A ENSG00000237613 FAM138A
## OR4F5 ENSG00000186092 OR4F5
## AL627309.1 ENSG00000238009 AL627309.1
## AL627309.3 ENSG00000239945 AL627309.3
## AL627309.4 ENSG00000241599 AL627309.4
We subset Seurat object for the Ensembl IDs of interest.
sobj_list = lapply(sobj_list, FUN = function(one_sobj) {
# Extract metadata
one_metadata = one_sobj@meta.data
# Extract and subset gene annotation
one_annotation = one_sobj@assays[["RNA"]]@meta.features %>%
dplyr::filter(Ensembl_ID %in% gene_corresp$Ensembl_ID)
# Subset gene corresp for reordering
one_gene_corresp = gene_corresp %>%
dplyr::filter(Ensembl_ID %in% one_annotation$Ensembl_ID)
# Extract count matrix and subset genes
one_count_matrix = one_sobj@assays[["RNA"]]@counts
one_count_matrix = one_count_matrix[rownames(one_annotation), ]
# Reorder according to the gene correspondence
gene_order = match(one_gene_corresp$Ensembl_ID,
one_annotation$Ensembl_ID)
# Reorder the count matrix and annotation
one_annotation = one_annotation[gene_order, ]
one_count_matrix = one_count_matrix[gene_order, ]
rownames(one_count_matrix) = rownames(one_gene_corresp)
# Build again the Seurat object
one_sobj = Seurat::CreateSeuratObject(counts = one_count_matrix,
meta.data = one_metadata)
one_sobj@assays[["RNA"]]@meta.features = one_gene_corresp
return(one_sobj)
})
lapply(sobj_list, FUN = dim) %>%
do.call(rbind, .) %>%
rbind(., colSums(.))
## [,1] [,2]
## here.2021_31 25605 885
## here.2021_36 25605 528
## here.2021_41 25605 1982
## here.2022_03 25605 3457
## here.2022_14 25605 2422
## here.2022_01 25605 835
## here.2022_02 25605 2002
## wu.F18 25605 1372
## wu.F31B 25605 4624
## wu.F31W 25605 3520
## wu.F59 25605 2445
## wu.F62B 25605 3221
## wu.F62W 25605 2360
## takahashi.GSM3717034 8926 65
## takahashi.GSM3717035 10676 270
## takahashi.GSM3717036 11842 497
## takahashi.GSM3717037 25605 3781
## takahashi.GSM3717038 25605 1023
## 415519 35289
We combine all datasets :
sobj = base::merge(sobj_list[[1]],
y = sobj_list[c(2:length(sobj_list))],
add.cell.ids = names(sobj_list))
sobj
## An object of class Seurat
## 25605 features across 35289 samples within 1 assay
## Active assay: RNA (25605 features, 0 variable features)
We add again the correspondence between gene names and gene ID. We take the correspondence from one individual 10X dataset.
sobj@assays$RNA@meta.features = sobj_list[[1]]@assays$RNA@meta.features[, c("Ensembl_ID", "gene_name")]
head(sobj@assays$RNA@meta.features)
## Ensembl_ID gene_name
## MIR1302-2HG ENSG00000243485 MIR1302-2HG
## FAM138A ENSG00000237613 FAM138A
## OR4F5 ENSG00000186092 OR4F5
## AL627309.1 ENSG00000238009 AL627309.1
## AL627309.3 ENSG00000239945 AL627309.3
## AL627309.4 ENSG00000241599 AL627309.4
We remove the list of objects :
rm(sobj_list)
We keep a subset of meta.data and reset levels :
sobj@meta.data = sobj@meta.data[, c("orig.ident", "nCount_RNA", "nFeature_RNA", "log_nCount_RNA",
"project_name", "sample_identifier", "sample_type",
"laboratory", "location", "Seurat.Phase", "cyclone.Phase",
"percent.mt", "percent.rb", "cell_type")]
sobj$orig.ident = factor(sobj$orig.ident, levels = levels(sample_info$project_name))
sobj$project_name = factor(sobj$project_name, levels = levels(sample_info$project_name))
sobj$sample_identifier = factor(sobj$sample_identifier, levels = levels(sample_info$sample_identifier))
sobj$sample_type = factor(sobj$sample_type, levels = unique(sample_info$sample_type))
sobj$cell_type = factor(sobj$cell_type, levels = names(color_markers))
summary(sobj@meta.data)
## orig.ident nCount_RNA nFeature_RNA log_nCount_RNA
## F31B : 4624 Min. : 318 Min. : 250 Min. : 5.762
## GSM3717037: 3781 1st Qu.: 3274 1st Qu.:1226 1st Qu.: 8.094
## F31W : 3520 Median : 8707 Median :2298 Median : 9.072
## 2022_03 : 3457 Mean : 11627 Mean :2438 Mean : 8.877
## F62B : 3221 3rd Qu.: 16354 3rd Qu.:3371 3rd Qu.: 9.702
## F59 : 2445 Max. :139803 Max. :7942 Max. :11.848
## (Other) :14241
## project_name sample_identifier sample_type laboratory
## F31B : 4624 Wu_HD_2 : 4624 HS: 9274 Length:35289
## GSM3717037: 3781 Takahashi_HD_4: 3781 HD:26015 Class :character
## F31W : 3520 Wu_HD_3 : 3520 Mode :character
## 2022_03 : 3457 HS_4 : 3457
## F62B : 3221 Wu_HD_5 : 3221
## F59 : 2445 Wu_HD_4 : 2445
## (Other) :14241 (Other) :14241
## location Seurat.Phase cyclone.Phase percent.mt
## Length:35289 Length:35289 Length:35289 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 2.778
## Mode :character Mode :character Mode :character Median : 4.516
## Mean : 5.360
## 3rd Qu.: 6.868
## Max. :20.000
##
## percent.rb cell_type
## Min. : 0.4948 ORS :9121
## 1st Qu.:19.1530 IFE granular spinous:5349
## Median :23.9507 IFE basal :4117
## Mean :23.2444 HF-SCs :3292
## 3rd Qu.:28.2717 cuticle :3021
## Max. :48.0392 proliferative :2590
## (Other) :7799
We remove genes that are expressed in less than 5 cells :
sobj = aquarius::filter_features(sobj, min_cells = 5)
## [1] 25605 35289
## [1] 19849 35289
sobj
## An object of class Seurat
## 19849 features across 35289 samples within 1 assay
## Active assay: RNA (19849 features, 0 variable features)
How many cells by sample ?
table(sobj$project_name)
##
## 2021_31 2021_36 2021_41 2022_03 2022_14 2022_01 2022_02
## 885 528 1982 3457 2422 835 2002
## F18 F31B F31W F59 F62B F62W GSM3717034
## 1372 4624 3520 2445 3221 2360 65
## GSM3717035 GSM3717036 GSM3717037 GSM3717038
## 270 497 3781 1023
We represent this information as a barplot :
aquarius::plot_barplot(df = table(sobj$project_name,
sobj$cell_type) %>%
as.data.frame.table() %>%
`colnames<-`(c("Sample", "Cell Type", "Number")),
x = "Sample", y = "Number", fill = "Cell Type",
position = position_fill()) +
ggplot2::scale_fill_manual(values = unlist(color_markers),
breaks = names(color_markers),
name = "Cell Type")
This is the same barplot with another position :
aquarius::plot_barplot(df = table(sobj$project_name,
sobj$cell_type) %>%
as.data.frame.table() %>%
`colnames<-`(c("Sample", "Cell Type", "Number")),
x = "Sample", y = "Number", fill = "Cell Type",
position = position_stack()) +
ggplot2::scale_fill_manual(values = unlist(color_markers),
breaks = names(color_markers),
name = "Cell Type")
We normalize the count matrix for remaining cells and select highly variable features :
sobj = Seurat::NormalizeData(sobj,
normalization.method = "LogNormalize")
sobj = Seurat::FindVariableFeatures(sobj, nfeatures = 2000)
sobj = Seurat::ScaleData(sobj)
sobj
## An object of class Seurat
## 19849 features across 35289 samples within 1 assay
## Active assay: RNA (19849 features, 2000 variable features)
We perform a PCA :
sobj = Seurat::RunPCA(sobj,
assay = "RNA",
reduction.name = "RNA_pca",
npcs = 100,
seed.use = 1337L)
sobj
## An object of class Seurat
## 19849 features across 35289 samples within 1 assay
## Active assay: RNA (19849 features, 2000 variable features)
## 1 dimensional reduction calculated: RNA_pca
We choose the number of dimensions such that they summarize 60 % of the variability :
stdev = sobj@reductions[["RNA_pca"]]@stdev
stdev_prop = cumsum(stdev)/sum(stdev)
ndims = which(stdev_prop > 0.60)[1]
ndims
## [1] 37
We can visualize this on the elbow plot :
elbow_p = Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca") +
ggplot2::geom_point(x = ndims, y = stdev[ndims], col = "red")
x_text = ggplot_build(elbow_p)$layout$panel_params[[1]]$x$get_labels() %>% as.numeric()
elbow_p = elbow_p +
ggplot2::scale_x_continuous(breaks = sort(c(x_text, ndims)), limits = c(0, 100))
x_color = ifelse(ggplot_build(elbow_p)$layout$panel_params[[1]]$x$get_labels() %>%
as.numeric() %>% round(., 2) == round(ndims, 2), "red", "black")
elbow_p = elbow_p +
ggplot2::theme_classic() +
ggplot2::theme(axis.text.x = element_text(color = x_color))
elbow_p
We generate a tSNE and a UMAP with 37 principal components :
sobj = Seurat::RunTSNE(sobj,
reduction = "RNA_pca",
dims = 1:ndims,
seed.use = 1337L,
num_threads = n_threads, # Rtsne::Rtsne option
reduction.name = paste0("RNA_pca_", ndims, "_tsne"))
sobj = Seurat::RunUMAP(sobj,
reduction = "RNA_pca",
dims = 1:ndims,
seed.use = 1337L,
reduction.name = paste0("RNA_pca_", ndims, "_umap"))
(Time to run : 124.05 s)
We can visualize the two representations :
tsne = Seurat::DimPlot(sobj, group.by = "project_name",
reduction = paste0("RNA_pca_", ndims, "_tsne")) +
ggplot2::scale_color_manual(values = sample_info$color,
breaks = sample_info$project_name) +
Seurat::NoAxes() + ggplot2::ggtitle("PCA - tSNE") +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
legend.position = "none")
umap = Seurat::DimPlot(sobj, group.by = "project_name",
reduction = paste0("RNA_pca_", ndims, "_umap")) +
ggplot2::scale_color_manual(values = sample_info$color,
breaks = sample_info$project_name) +
Seurat::NoAxes() + ggplot2::ggtitle("PCA - UMAP") +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5))
tsne | umap
We remove sample specific effect on the pca using Harmony :
`%||%` = function(lhs, rhs) {
if (!is.null(x = lhs)) {
return(lhs)
} else {
return(rhs)
}
}
set.seed(1337L)
sobj = harmony::RunHarmony(object = sobj,
group.by.vars = "project_name",
plot_convergence = TRUE,
reduction = "RNA_pca",
assay.use = "RNA",
reduction.save = "harmony",
max.iter.harmony = 50,
project.dim = FALSE)
(Time
to run : 198.87 s)
From this batch-effect removed projection, we generate a tSNE and a UMAP.
sobj = Seurat::RunUMAP(sobj,
seed.use = 1337L,
dims = 1:ndims,
reduction = "harmony",
reduction.name = paste0("harmony_", ndims, "_umap"),
reduction.key = paste0("harmony_", ndims, "umap_"))
sobj = Seurat::RunTSNE(sobj,
dims = 1:ndims,
seed.use = 1337L,
num_threads = n_threads, # Rtsne::Rtsne option
reduction = "harmony",
reduction.name = paste0("harmony_", ndims, "_tsne"),
reduction.key = paste0("harmony", ndims, "tsne_"))
(Time to run : 127.66 s)
We visualize the corrected projections :
tsne = Seurat::DimPlot(sobj, group.by = "project_name",
reduction = paste0("harmony_", ndims, "_tsne")) +
ggplot2::scale_color_manual(values = sample_info$color,
breaks = sample_info$project_name) +
Seurat::NoAxes() + ggplot2::ggtitle("PCA - harmony - tSNE") +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
legend.position = "none")
umap = Seurat::DimPlot(sobj, group.by = "project_name",
reduction = paste0("harmony_", ndims, "_umap")) +
ggplot2::scale_color_manual(values = sample_info$color,
breaks = sample_info$project_name) +
Seurat::NoAxes() + ggplot2::ggtitle("PCA - harmony - UMAP") +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5))
tsne | umap
We will keep the tSNE from harmony :
reduction = "harmony"
name2D = paste0("harmony_", ndims, "_tsne")
We generate a clustering :
sobj = Seurat::FindNeighbors(sobj, reduction = reduction, dims = 1:ndims)
sobj = Seurat::FindClusters(sobj, resolution = 1.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35289
## Number of edges: 1441587
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8872
## Number of communities: 33
## Elapsed time: 8 seconds
clusters_plot = Seurat::DimPlot(sobj, reduction = name2D, label = TRUE) +
Seurat::NoAxes() + Seurat::NoLegend() +
ggplot2::labs(title = "Clusters ID") +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5))
clusters_plot
We represent the 4 quality metrics :
plot_list = Seurat::FeaturePlot(sobj, reduction = name2D,
combine = FALSE, pt.size = 0.25,
features = c("percent.mt", "percent.rb", "log_nCount_RNA", "nFeature_RNA"))
plot_list = lapply(plot_list, FUN = function(one_plot) {
one_plot +
Seurat::NoAxes() +
ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
ggplot2::theme(aspect.ratio = 1)
})
patchwork::wrap_plots(plot_list, nrow = 1)
We visualize cell type :
plot_list = lapply((c(paste0("RNA_pca_", ndims, "_tsne"),
paste0("RNA_pca_", ndims, "_umap"),
paste0("harmony_", ndims, "_tsne"),
paste0("harmony_", ndims, "_umap"))),
FUN = function(one_red) {
Seurat::DimPlot(sobj, group.by = "cell_type",
reduction = one_red,
cols = color_markers) +
Seurat::NoAxes() + ggplot2::ggtitle(one_red) +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5))
})
patchwork::wrap_plots(plot_list, nrow = 2) +
patchwork::plot_layout(guides = "collect")
We make a representation split by origin to show cell types :
plot_list = aquarius::plot_split_dimred(sobj,
reduction = name2D,
split_by = "project_name",
split_color = setNames(sample_info$color,
nm = sample_info$project_name),
group_by = "cell_type",
group_color = color_markers)
plot_list[[length(plot_list) + 1]] = patchwork::guide_area()
patchwork::wrap_plots(plot_list, ncol = 4) +
patchwork::plot_layout(guides = "collect") &
ggplot2::theme(legend.position = "right")
We can represent cell type split by laboratory, split by sample of origin :
plot_list = aquarius::plot_split_dimred(sobj,
reduction = name2D,
split_by = "laboratory",
group_by = "cell_type",
group_color = color_markers)
plot_list[[length(plot_list) + 1]] = patchwork::guide_area()
patchwork::wrap_plots(plot_list, nrow = 1) +
patchwork::plot_layout(guides = "collect") &
ggplot2::theme(legend.position = "right")
We can represent cell type split by location, split by sample of origin :
plot_list = aquarius::plot_split_dimred(sobj,
reduction = name2D,
split_by = "location",
group_by = "cell_type",
group_color = color_markers)
plot_list[[length(plot_list) + 1]] = patchwork::guide_area()
patchwork::wrap_plots(plot_list, nrow = 1) +
patchwork::plot_layout(guides = "collect") &
ggplot2::theme(legend.position = "right")
We can represent clusters, split by sample of origin :
plot_list = aquarius::plot_split_dimred(sobj,
reduction = name2D,
split_by = "project_name",
group_by = "seurat_clusters",
split_color = setNames(sample_info$color,
nm = sample_info$project_name),
group_color = aquarius::gg_color_hue(length(levels(sobj$seurat_clusters))))
plot_list[[length(plot_list) + 1]] = clusters_plot +
ggplot2::labs(title = "Cluster ID") &
ggplot2::theme(plot.title = element_text(hjust = 0.5, size = 15))
patchwork::wrap_plots(plot_list, ncol = 4) &
Seurat::NoLegend()